In [29]:
# !pip install polars pandas plotly pyarrow nbformat
In [30]:
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
In [31]:
dataset = pl.read_csv(
    './data/imdb.tsv',
    separator='\t',
    null_values="\\N",
    quote_char=None
)

rating_dataset = pl.read_csv(
    './data/imdb_rating.tsv',
    separator='\t',
    null_values="\\N",
    quote_char=None
)
In [32]:
# theme = 'plotly_dark'
theme = 'plotly_white'
In [33]:
rating_preference = 2
In [34]:
df = dataset.join(rating_dataset, on='tconst', how='inner')
df
Out[34]:
shape: (1_306_106, 11)
tconsttitleTypeprimaryTitleoriginalTitleisAdultstartYearendYearruntimeMinutesgenresaverageRatingnumVotes
strstrstrstri64i64stri64strf64i64
"tt0000001""short""Carmencita""Carmencita"01894null1"Documentary,Short"5.71966
"tt0000002""short""Le clown et ses chiens""Le clown et ses chiens"01892null5"Animation,Short"5.8264
"tt0000003""short""Pauvre Pierrot""Pauvre Pierrot"01892null4"Animation,Comedy,Romance"6.51810
"tt0000004""short""Un bon bock""Un bon bock"01892null12"Animation,Short"5.6178
"tt0000005""short""Blacksmith Scene""Blacksmith Scene"01893null1"Comedy,Short"6.22610
……………………………
"tt9916730""movie""6 Gunn""6 Gunn"02017null116null8.310
"tt9916766""tvEpisode""Episode #10.15""Episode #10.15"02019null43"Family,Game-Show,Reality-TV"7.021
"tt9916778""tvEpisode""Escape""Escape"02019nullnull"Crime,Drama,Mystery"7.236
"tt9916840""tvEpisode""Horrid Henry's Comic Caper""Horrid Henry's Comic Caper"02014null11"Adventure,Animation,Comedy"8.86
"tt9916880""tvEpisode""Horrid Henry Knows It All""Horrid Henry Knows It All"02014null10"Adventure,Animation,Comedy"8.26
In [35]:
df.schema
Out[35]:
Schema([('tconst', String),
        ('titleType', String),
        ('primaryTitle', String),
        ('originalTitle', String),
        ('isAdult', Int64),
        ('startYear', Int64),
        ('endYear', String),
        ('runtimeMinutes', Int64),
        ('genres', String),
        ('averageRating', Float64),
        ('numVotes', Int64)])
Type Frequency¶
In [36]:
# Assuming df is your DataFrame
type_count = (
    df.lazy()
    .group_by('titleType')
    .agg([
        pl.len().alias('count')  # Only include the count aggregation
    ]).sort("count", descending=True)
)

# Collecting the result
type_count.collect()
Out[36]:
shape: (10, 2)
titleTypecount
stru32
"tvEpisode"638904
"movie"290903
"short"146929
"tvSeries"87044
"tvMovie"50410
"video"49288
"tvMiniSeries"14842
"videoGame"14528
"tvSpecial"11069
"tvShort"2189
In [37]:
type_count_in_df = type_count.collect().to_pandas()

# Create a bar chart using Plotly
fig = px.bar(type_count_in_df, x="titleType", y="count", title="Count of Listed Categories", labels={"titleType": "Category", "count": "Count"})

fig.update_layout(template=theme)
fig.show(renderer='notebook')
Genres Frequency¶
In [38]:
genres_count = (
    df.lazy()
    .select(pl.col("genres").str.split(","))
    .explode("genres")
    .group_by("genres")
    .agg([pl.len().alias("count")])
    .sort("count", descending=True)
)

genres_count.collect()
Out[38]:
shape: (29, 2)
genrescount
stru32
"Drama"440604
"Comedy"410433
"Action"162759
"Documentary"162438
"Animation"156700
……
"News"17817
"Western"15948
"War"14247
"Musical"12595
"Film-Noir"882
In [39]:
genres_count_in_df = genres_count.collect().to_pandas()

# Create a bar chart using Plotly
fig = px.bar(genres_count_in_df, x="genres", y="count", title="Count of Listed Genres", labels={"genres": "Genres", "count": "Count"})
# Apply dark theme
fig.update_layout(template=theme)
fig.show()
Top Few Titles¶
In [40]:
top_few_movies = (
    df.lazy()
    .select(["primaryTitle", "titleType", "averageRating", "numVotes", (pl.col('averageRating') ** rating_preference * pl.col('numVotes') / 1000).alias("weightedRating")])
    .sort("weightedRating", descending=True)
    .limit(25)
)

top_few_movies.collect()
Out[40]:
shape: (25, 5)
primaryTitletitleTypeaverageRatingnumVotesweightedRating
strstrf64i64f64
"The Shawshank Redemption""movie"9.32732446236329.25454
"The Dark Knight""movie"9.02705636219156.516
"Inception""movie"8.82401515185973.3216
"Game of Thrones""tvSeries"9.22154178182329.62592
"Breaking Bad""tvSeries"9.51964594177304.6085
……………
"The Godfather Part II""movie"9.01295390104926.59
"Saving Private Ryan""movie"8.61417607104846.21372
"Star Wars: Episode IV - A New …"movie"8.61387837102644.42452
"Inglourious Basterds""movie"8.31484315102254.46035
"Batman Begins""movie"8.21497020100659.6248
Top Few Titles by Type¶
In [41]:
top_few_movies_by_types = (
    df.lazy()
    .filter(pl.col("titleType") == "tvSeries")
    .select(["primaryTitle", "titleType", "averageRating", "numVotes", (pl.col('averageRating') ** rating_preference * pl.col('numVotes') / 1000).alias("weightedRating")])
    .sort("weightedRating", descending=True)
    .limit(25)
)

top_few_movies_by_types.collect()
Out[41]:
shape: (25, 5)
primaryTitletitleTypeaverageRatingnumVotesweightedRating
strstrf64i64f64
"Game of Thrones""tvSeries"9.22154178182329.62592
"Breaking Bad""tvSeries"9.51964594177304.6085
"Stranger Things""tvSeries"8.7123606493557.68416
"Friends""tvSeries"8.9102210980961.25389
"Sherlock""tvSeries"9.194711178430.26191
……………
"Prison Break""tvSeries"8.354966137866.14629
"Westworld""tvSeries"8.551434837161.643
"House""tvSeries"8.747687536094.66875
"The Sopranos""tvSeries"9.241997335546.51472
"Narcos""tvSeries"8.843932334021.17312
Top Few Titles by Genres¶
In [42]:
# top few titles by genres

top_few_movies_by_genres = (
    df.lazy()
    .filter(pl.col("genres").str.contains("Crime"))
    .select(["primaryTitle", "titleType", "averageRating", "numVotes", (pl.col('averageRating') ** rating_preference * pl.col('numVotes') / 1000).alias("weightedRating")])
    .sort("weightedRating", descending=True)
    .limit(25)
)

top_few_movies_by_genres.collect()
Out[42]:
shape: (25, 5)
primaryTitletitleTypeaverageRatingnumVotesweightedRating
strstrf64i64f64
"The Dark Knight""movie"9.02705636219156.516
"Breaking Bad""tvSeries"9.51964594177304.6085
"Pulp Fiction""movie"8.92099696166316.92016
"The Godfather""movie"9.21899931160810.15984
"Se7en""movie"8.61689151124929.60796
……………
"Catch Me If You Can""movie"8.1101475266577.87872
"12 Angry Men""movie"9.080852565490.525
"Scarface""movie"8.386160559355.96845
"Snatch""movie"8.287093158561.40044
"A Clockwork Orange""movie"8.384486858202.95652
Genres vs Rating¶
In [43]:
genres_avg_rating = (
    df.lazy()
    .select([pl.col("genres"), pl.col("averageRating")])  
    .with_columns([pl.col("genres").str.split(",").alias("genres")]) 
    .explode("genres") 
    .group_by("genres")  
    .agg([
        pl.col("averageRating").mean().alias("avgRating"), 
        pl.len().alias("count"),
    ])
    .with_columns([
        (pl.col("avgRating") ** rating_preference * pl.col("count") / 1000).alias("weightedRating")
    ])
    .sort("weightedRating", descending=True)
)

genres_avg_rating.collect()
Out[43]:
shape: (29, 4)
genresavgRatingcountweightedRating
strf64u32f64
"Drama"7.04794444060421886.351802
"Comedy"6.99314841043320071.866867
"Documentary"7.2644981624388572.329017
"Action"7.0204031627598021.748551
"Animation"7.1062941567007913.257247
…………
"Adult"6.33805419635788.756304
"Western"6.95339915948771.081834
"War"7.02972614247704.044589
"Musical"6.63453812595554.395224
"Film-Noir"6.46757488236.893627
In [44]:
fig = px.bar(
    genres_avg_rating.collect(), 
    x="genres", 
    y="count", 
    title="Genres vs Rating", 
    labels={"genres": "Genres", "count": "Count"},
    color="avgRating",  # Color by avgRating
    color_continuous_scale="Viridis"  # Choose color scale
)

# Customize layout for better aesthetics
fig.update_layout(
    xaxis_title="Genres",
    yaxis_title="Count",
    xaxis_tickangle=-45,  # Rotate x-axis labels for readability
    template=theme  # Dark mode
)

fig.show()
In [45]:
rating_time = (
    df.lazy()
    .filter(pl.col("startYear").cast(pl.Int64).is_not_null())  # Filter out null values in startYear
    .with_columns([
        pl.col("startYear").cast(pl.Int64),  # Ensure startYear is an integer
    ])
    .group_by("startYear")  
    .agg([
        pl.len().alias("count"),
        pl.col("averageRating").mean().alias("avgRating"), 
        pl.col("numVotes").sum().alias("totalNumVotes"),
    ])
    .with_columns([
        (pl.col("avgRating") ** rating_preference * pl.col("totalNumVotes") / pl.col("count")).alias("weightedRating")  # Weighted rating calculation
    ])
    .sort("startYear", descending=False)  # Sort by startYear for time-based analysis
)

rating_time.collect()
Out[45]:
shape: (144, 5)
startYearcountavgRatingtotalNumVotesweightedRating
i64u32f64i64f64
187416.8183584850.4
187745.64963888.64
187836.033333360543742.001852
188125.669810944.64
188225.63014719.68
……………
2019582627.1357615175090845228.6017
2020526347.1099423041398529210.507123
2021536887.1349913914947737122.362334
2022495027.2879843705076039754.769032
2023126567.399802565916724484.78512
In [46]:
rating_time_df = rating_time.collect().to_pandas()

# Create the figure and add multiple traces
fig = go.Figure()

# Add line for Weighted Rating
fig.add_trace(go.Scatter(
    x=rating_time_df["startYear"],
    y=rating_time_df["weightedRating"],
    mode="lines",
    name="Weighted Rating",
    line=dict(color="blue")
))

# Add line for Total Number of Votes (scaled for visualization if needed)
fig.add_trace(go.Scatter(
    x=rating_time_df["startYear"],
    y=rating_time_df["totalNumVotes"],
    mode="lines",
    name="Total Number of Votes",
    line=dict(color="violet"),
    yaxis="y2"  # Using a secondary y-axis for totalNumVotes if needed for scale
))

# Set the layout
fig.update_layout(
    title="Weighted Ratings, Average Ratings, and Total Number of Votes Over Time",
    xaxis_title="Start Year",
    yaxis_title="Ratings",
    yaxis2=dict(title="Total Number of Votes", overlaying="y", side="right"),
    template=theme
)

# Show the plot
fig.show()
In [47]:
df_genres = (
    df.lazy()
    .filter(pl.col("genres").is_not_null())  
    .with_columns([pl.col("genres").str.split(",").alias("genre_list")])  
    .filter(pl.col("genre_list").len() > 1) 
)

df_genres.collect()
Out[47]:
shape: (1_286_401, 12)
tconsttitleTypeprimaryTitleoriginalTitleisAdultstartYearendYearruntimeMinutesgenresaverageRatingnumVotesgenre_list
strstrstrstri64i64stri64strf64i64list[str]
"tt0000001""short""Carmencita""Carmencita"01894null1"Documentary,Short"5.71966["Documentary", "Short"]
"tt0000002""short""Le clown et ses chiens""Le clown et ses chiens"01892null5"Animation,Short"5.8264["Animation", "Short"]
"tt0000003""short""Pauvre Pierrot""Pauvre Pierrot"01892null4"Animation,Comedy,Romance"6.51810["Animation", "Comedy", "Romance"]
"tt0000004""short""Un bon bock""Un bon bock"01892null12"Animation,Short"5.6178["Animation", "Short"]
"tt0000005""short""Blacksmith Scene""Blacksmith Scene"01893null1"Comedy,Short"6.22610["Comedy", "Short"]
………………………………
"tt9916708""tvEpisode""Horrid Henry Goes Gross""Horrid Henry Goes Gross"02012nullnull"Adventure,Animation,Comedy"8.66["Adventure", "Animation", "Comedy"]
"tt9916766""tvEpisode""Episode #10.15""Episode #10.15"02019null43"Family,Game-Show,Reality-TV"7.021["Family", "Game-Show", "Reality-TV"]
"tt9916778""tvEpisode""Escape""Escape"02019nullnull"Crime,Drama,Mystery"7.236["Crime", "Drama", "Mystery"]
"tt9916840""tvEpisode""Horrid Henry's Comic Caper""Horrid Henry's Comic Caper"02014null11"Adventure,Animation,Comedy"8.86["Adventure", "Animation", "Comedy"]
"tt9916880""tvEpisode""Horrid Henry Knows It All""Horrid Henry Knows It All"02014null10"Adventure,Animation,Comedy"8.26["Adventure", "Animation", "Comedy"]
In [48]:
df_genres = (
    df.lazy()
    .filter(pl.col("genres").is_not_null())  # Remove rows with null genres
    .with_columns([pl.col("genres").str.split(",").alias("genre_list")])  # Split genres into lists
    .explode("genre_list")  # Explode to have each genre in its own row
)

df_genres.collect()
Out[48]:
shape: (2_575_480, 12)
tconsttitleTypeprimaryTitleoriginalTitleisAdultstartYearendYearruntimeMinutesgenresaverageRatingnumVotesgenre_list
strstrstrstri64i64stri64strf64i64str
"tt0000001""short""Carmencita""Carmencita"01894null1"Documentary,Short"5.71966"Documentary"
"tt0000001""short""Carmencita""Carmencita"01894null1"Documentary,Short"5.71966"Short"
"tt0000002""short""Le clown et ses chiens""Le clown et ses chiens"01892null5"Animation,Short"5.8264"Animation"
"tt0000002""short""Le clown et ses chiens""Le clown et ses chiens"01892null5"Animation,Short"5.8264"Short"
"tt0000003""short""Pauvre Pierrot""Pauvre Pierrot"01892null4"Animation,Comedy,Romance"6.51810"Animation"
………………………………
"tt9916840""tvEpisode""Horrid Henry's Comic Caper""Horrid Henry's Comic Caper"02014null11"Adventure,Animation,Comedy"8.86"Animation"
"tt9916840""tvEpisode""Horrid Henry's Comic Caper""Horrid Henry's Comic Caper"02014null11"Adventure,Animation,Comedy"8.86"Comedy"
"tt9916880""tvEpisode""Horrid Henry Knows It All""Horrid Henry Knows It All"02014null10"Adventure,Animation,Comedy"8.26"Adventure"
"tt9916880""tvEpisode""Horrid Henry Knows It All""Horrid Henry Knows It All"02014null10"Adventure,Animation,Comedy"8.26"Animation"
"tt9916880""tvEpisode""Horrid Henry Knows It All""Horrid Henry Knows It All"02014null10"Adventure,Animation,Comedy"8.26"Comedy"
In [49]:
df_pairs = (
    df_genres
    .join(df_genres, on="tconst", suffix="_pair")  # Join on tconst to get all genre pairs within the same movie
    .filter(pl.col("genre_list") < pl.col("genre_list_pair"))  # Filter to avoid duplicate pairs (A, B) and (B, A)
    .select([
        pl.concat_str([pl.col("genre_list"), pl.col("genre_list_pair")], separator=", ").alias("genre_pair"),  # Combine pairs into a single string
        pl.col("averageRating"),
    ])
)

df_pairs.collect()
Out[49]:
shape: (1_742_968, 2)
genre_pairaverageRating
strf64
"Documentary, Short"5.7
"Animation, Short"5.8
"Animation, Comedy"6.5
"Animation, Romance"6.5
"Comedy, Romance"6.5
……
"Adventure, Comedy"8.8
"Animation, Comedy"8.8
"Adventure, Animation"8.2
"Adventure, Comedy"8.2
"Animation, Comedy"8.2
In [50]:
genre_pair_stats = (
    df_pairs
    .group_by("genre_pair")
    .agg([
        pl.col("averageRating").mean().alias("avgRating"),  # Calculate the average rating per genre pair
        pl.col("genre_pair").count().alias("count")  # Count of each genre pair
    ])
    .with_columns([
        (pl.col("avgRating") ** rating_preference * pl.col("count") / 1000).alias("weightedRating")  # Calculate a weighted rating
    ])
    # .sort("count", descending=True)  
)

genre_pair_stats.collect()
Out[50]:
shape: (362, 4)
genre_pairavgRatingcountweightedRating
strf64u32f64
"Music, Talk-Show"6.948446026290.94022
"Biography, News"7.4644861075.961885
"Comedy, Film-Noir"6.152632190.719243
"Action, Romance"6.5819473185137.980638
"Action, Family"6.8228083582166.744663
…………
"Animation, History"7.32470968836.91214
"History, Reality-TV"7.69142956033.128521
"Documentary, Game-Show"7.24155429615.522271
"Drama, History"7.27894114331759.299066
"Family, History"7.44418673140.509027
In [51]:
# Select the top genre pairs by count for better visualization
top_genre_pairs = genre_pair_stats.collect().sort("count", descending=True).head(20)

# Create a bar chart showing average rating and count for each genre pair
fig = px.bar(
    top_genre_pairs.to_pandas(),  # Convert Polars DataFrame to Pandas for compatibility with Plotly
    x="genre_pair",
    y="count",
    color="avgRating",
    title="Top 20 Genre Pair Combinations by Popularity and Average Rating",
    labels={"genre_pair": "Genre Pair", "count": "Number of Movies", "avgRating": "Average Rating"},
    color_continuous_scale="Viridis",
    template=theme
)

# Update layout for readability
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title="Genre Pair",
    yaxis_title="Number of Movies",
    coloraxis_colorbar=dict(title="Average Rating"),
)

# Show the plot
fig.show()
In [52]:
rating_distribution = (
    df.lazy()
    .select([pl.col("genres"), pl.col("averageRating")])
    .with_columns(pl.col("genres").str.split(",").alias("genres"))  # Split genres into lists
    .explode("genres")  # Expand list to separate rows
    .group_by("genres")
    .agg([
        pl.col("averageRating").median().alias("medianRating"),
        pl.col("averageRating").quantile(0.25).alias("q1Rating"),  # 25th percentile
        pl.col("averageRating").quantile(0.75).alias("q3Rating"),  # 75th percentile
        pl.col("averageRating").std().alias("stdDevRating")  # Standard deviation
    ])
)

rating_distribution.collect()
Out[52]:
shape: (29, 5)
genresmedianRatingq1Ratingq3RatingstdDevRating
strf64f64f64f64
"Film-Noir"6.56.16.90.695806
"Comedy"7.26.37.91.33825
"Romance"7.06.07.91.376029
"Adult"6.45.57.21.328186
"Mystery"7.46.57.91.238499
……………
"Biography"7.36.68.01.205353
"War"7.26.37.91.252456
"Talk-Show"7.16.27.81.614222
"Game-Show"7.26.47.91.311489
"Thriller"6.65.47.61.566004
In [53]:
# Collect the data and convert to pandas DataFrame
rating_distribution_df = rating_distribution.collect().to_pandas()

# Create a box plot to show the rating distribution by genre
fig = px.box(
    rating_distribution_df,
    x="genres",
    y="medianRating",
    title="Rating Distribution by Genre (Median and Spread)",
    labels={"genres": "Genre", "medianRating": "Median Rating"},
    points="all"  # Show all data points for better spread indication
)

# Customize layout
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for readability
    yaxis_title="Rating Distribution",
    template=theme  # Change the theme if desired
)

# Show the plot
fig.show()
In [ ]: